  
from word2vec.word2vec import WordTokenizer, CorpusData, Word2vec

# --- File locations ---------------------------------------------------------
# Path of the corpus text file.
CORPUS_DATA_PATH = './resources/corpus/text8.txt'
# Path where the model parameters are saved.
MODEL_DICT_PATH = './resources/model/word2vec-latest.pth'

# --- Sampling settings ------------------------------------------------------
# NOTE(review): the sampling and optimisation constants below are not
# referenced by the visible part of this script; presumably they are kept in
# sync with a training script -- confirm before removing.
# Maximum distance of a positive sample from the centre word.
# MAX_WINDOW_SIZE = 3
MAX_WINDOW_SIZE = 5
# Number of negative samples drawn for each positive sample.
NEGATIVE_SAMPLE_NUM = 15

# --- Model dimensions -------------------------------------------------------
# Maximum number of entries in the vocabulary.
MAX_VOCAB_SIZE = 10000
# Dimensionality of each word vector.
EMBEDDING_SIZE = 100

# --- Optimisation settings --------------------------------------------------
# Number of training iterations (epochs).
EPOCH_NUM = 1
# Batch size.
BATCH_SIZE = 32
# Learning rate.
LEARNING_RATE = 0.2


# Query words handed to the model test below.
TEST_LIST = [
    'two', 'america', 'computer',
    'sun', 'china', 'cat',
    'food', 'black', 'light',
    'java', 'king', 'today',
    'wind', 'water', 'morning',
]

# Read the entire corpus file into memory as a single UTF-8 decoded string.
with open(CORPUS_DATA_PATH, 'r', encoding='utf-8') as corpus_file:
    file_content = corpus_file.read()

# Build the corpus object from a tokenizer and the vocabulary-size limit,
# then feed it the raw text.
word_tokenizer = WordTokenizer()
corpus_data = CorpusData(word_tokenizer, MAX_VOCAB_SIZE)
corpus_data.load_data(file_content)

# Construct the model over the loaded corpus with the configured embedding
# size and run its test routine against the saved parameters.
# NOTE(review): presumably test_model reports the 10 nearest words for each
# entry of TEST_LIST -- confirm against Word2vec.test_model.
word2vec = Word2vec(corpus_data, EMBEDDING_SIZE)
word2vec.test_model(
    model_state_path=MODEL_DICT_PATH,
    test_words_list=TEST_LIST,
    nearest_word_num=10,
)
